library(data.table)
library(ggplot2)
library(plotly)
library(dplyr)
library(imbalance)
library(class)
# Goal: detect credit card fraud. Problem: class imbalance. Outline: oversampling (Majority Weighted Minority Oversampling Technique), undersampling, SMOTE, testing.
# Load the transactions; header = TRUE takes column names from the first row.
# (Spelled out as TRUE because T is an ordinary variable that can be reassigned.)
df <- fread("creditcard.csv", header = TRUE)
# Store the 0/1 label as a factor so models treat it as a class, not a number.
df$Class <- factor(df$Class)
head(df)
# Per-class counts, which show how imbalanced the two classes are.
summary(df$Class)
0 1
284315 492
# TRUE when at least one cell of df is NA.
any(is.na(df))
[1] FALSE
# Standardise Time and Amount with scale(). scale() returns a one-column matrix,
# so flatten it to a plain numeric vector before writing it back; storing a
# data.frame inside a single column is fragile and nests on a plain data.frame.
df$Time <- as.numeric(scale(df$Time))
df$Amount <- as.numeric(scale(df$Amount))
# Confirm every predictor is numeric and Class is a two-level factor.
str(df)
Classes ‘data.table’ and 'data.frame': 284807 obs. of 31 variables:
$ Time : num -2 -2 -2 -2 -2 ...
$ V1 : num -1.36 1.192 -1.358 -0.966 -1.158 ...
$ V2 : num -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
$ V3 : num 2.536 0.166 1.773 1.793 1.549 ...
$ V4 : num 1.378 0.448 0.38 -0.863 0.403 ...
$ V5 : num -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
$ V6 : num 0.4624 -0.0824 1.8005 1.2472 0.0959 ...
$ V7 : num 0.2396 -0.0788 0.7915 0.2376 0.5929 ...
$ V8 : num 0.0987 0.0851 0.2477 0.3774 -0.2705 ...
$ V9 : num 0.364 -0.255 -1.515 -1.387 0.818 ...
$ V10 : num 0.0908 -0.167 0.2076 -0.055 0.7531 ...
$ V11 : num -0.552 1.613 0.625 -0.226 -0.823 ...
$ V12 : num -0.6178 1.0652 0.0661 0.1782 0.5382 ...
$ V13 : num -0.991 0.489 0.717 0.508 1.346 ...
$ V14 : num -0.311 -0.144 -0.166 -0.288 -1.12 ...
$ V15 : num 1.468 0.636 2.346 -0.631 0.175 ...
$ V16 : num -0.47 0.464 -2.89 -1.06 -0.451 ...
$ V17 : num 0.208 -0.115 1.11 -0.684 -0.237 ...
$ V18 : num 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
$ V19 : num 0.404 -0.146 -2.262 -1.233 0.803 ...
$ V20 : num 0.2514 -0.0691 0.525 -0.208 0.4085 ...
$ V21 : num -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
$ V22 : num 0.27784 -0.63867 0.77168 0.00527 0.79828 ...
$ V23 : num -0.11 0.101 0.909 -0.19 -0.137 ...
$ V24 : num 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
$ V25 : num 0.129 0.167 -0.328 0.647 -0.206 ...
$ V26 : num -0.189 0.126 -0.139 -0.222 0.502 ...
$ V27 : num 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
$ V28 : num -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
$ Amount: num 0.245 -0.3425 1.1607 0.1405 -0.0734 ...
$ Class : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, ".internal.selfref")=<externalptr>
# Preview the first rows of df now that Time and Amount have been scaled.
head(df)
# rbind: combines data frames by rows.
No Fraud Fraud
199020 199020
[1] 12
library(unbalanced)
# NOTE(review): the next two lines look like leftovers from the package example;
# nothing in the visible part of this script uses ubIonosphere or n.
data(ubIonosphere)
n <- ncol(ubIonosphere)
# Seed the RNG first so the random undersampling keeps the same rows every run.
set.seed(1234)
# Undersample the majority class. Column 31 is Class, so it is dropped from X
# and passed separately as Y. With method = "percPos", perc = 50 asks for the
# positive class to make up 50% of the result.
df.ubUnder <- ubUnder(X = df[, -31], Y = df$Class, perc = 50, method = "percPos")
# Re-attach the label to the retained predictor rows.
newData <- cbind(df.ubUnder$X, Class = df.ubUnder$Y)
newData
# Per-class counts after undersampling.
summary(newData$Class)
0 1
492 492
# Split the balanced data 70/30 into training and validation partitions.
set.seed(1234)
train.id <- caTools::sample.split(newData$Class, SplitRatio = 0.70)

# Labels for each partition.
newData.train.class <- newData$Class[train.id]
newData.validate.class <- newData$Class[!train.id]

# Rows for each partition (train.id is TRUE for training rows).
newData.train <- subset(newData, subset = train.id)
newData.validate <- subset(newData, subset = !train.id)
# Score a k-nearest-neighbour classifier for one value of k.
#
# Trains knn() on columns 1:30 of `train` and predicts columns 1:30 of
# `validate`, then derives metrics from the resulting 2x2 confusion table.
# The second factor level ("1", fraud) is treated as the positive class.
#
# Args:
#   k:              number of neighbours passed to knn().
#   train:          training data; columns 1:30 are used as predictors.
#   validate:       validation data; columns 1:30 are used as predictors.
#   train.class:    true labels of `train` (two-level factor).
#   validate.class: true labels of `validate` (two-level factor).
#   The data arguments default to the script-level objects of the same purpose,
#   so find.optimum.k(k) keeps working unchanged.
#
# Returns:
#   Named numeric vector with elements Accuracy, Sensitivity and Specificity.
find.optimum.k <- function(k,
                           train = newData.train,
                           validate = newData.validate,
                           train.class = newData.train.class,
                           validate.class = newData.validate.class) {
  predicted <- knn(train[, 1:30], validate[, 1:30], train.class, k = k)

  # Rows are predicted labels, columns are true labels.
  confusion.table <- table(predicted, validate.class)
  true.pos <- confusion.table[2, 2]
  false.pos <- confusion.table[2, 1]
  false.neg <- confusion.table[1, 2]
  true.neg <- confusion.table[1, 1]

  c(
    Accuracy = (true.pos + true.neg) / sum(confusion.table),
    # Sensitivity (recall) is taken over the ACTUAL positives: TP / (TP + FN).
    Sensitivity = true.pos / (true.pos + false.neg),
    # Specificity is taken over the ACTUAL negatives: TN / (TN + FP).
    Specificity = true.neg / (true.neg + false.pos)
  )
}
# Evaluate the kNN classifier for k = 1..20 and plot each metric against k.
vec.k <- 1:20
results <- sapply(vec.k, find.optimum.k)
# sapply() yields one column per k; transpose so every k becomes a row.
results <- as.data.frame(t(results))
results$k <- vec.k

# One line per metric: red = Accuracy, blue = Sensitivity, green = Specificity.
pl <- ggplot(data = results, mapping = aes(x = k)) +
  geom_line(aes(y = Accuracy), size = 1, color = "red") +
  geom_line(aes(y = Sensitivity), size = 1, color = "blue") +
  geom_line(aes(y = Specificity), size = 1, color = "green") +
  ylab('performance') +
  theme_bw()
library(plotly)
# Render the ggplot as an interactive plotly widget.
ggplotly(pl)
NA
NA
newData.validate.class
predictied 0 1
0 140 15
1 8 133
# Logistic regression of Class on every other column of the training set.
logistic.model <- glm(Class ~ ., family = binomial, data = newData.train)
glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and fit statistics for the full logistic model.
summary(logistic.model)
Call:
glm(formula = Class ~ ., family = binomial, data = newData.train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4321 -0.1517 0.0000 0.0000 3.1347
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.20140 10.64088 -0.301 0.764
Time 0.07075 0.43061 0.164 0.869
V1 -2.50412 23.71586 -0.106 0.916
V2 33.73531 152.15264 0.222 0.825
V3 -26.00857 60.49761 -0.430 0.667
V4 19.61983 48.62975 0.403 0.687
V5 -9.78592 12.04276 -0.813 0.416
V6 -16.83303 69.15649 -0.243 0.808
V7 -63.92582 238.74546 -0.268 0.789
V8 12.63729 41.30687 0.306 0.760
V9 -24.38046 73.20700 -0.333 0.739
V10 -56.13230 168.24894 -0.334 0.739
V11 42.71157 142.33152 0.300 0.764
V12 -76.32411 255.61469 -0.299 0.765
V13 -0.60973 6.81015 -0.090 0.929
V14 -82.27881 278.83220 -0.295 0.768
V15 -2.72018 9.96380 -0.273 0.785
V16 -72.72521 246.07846 -0.296 0.768
V17 -128.32083 432.01053 -0.297 0.766
V18 -48.80905 165.07732 -0.296 0.767
V19 19.23932 68.36637 0.281 0.778
V20 -8.27774 46.82875 -0.177 0.860
V21 7.45454 15.14105 0.492 0.622
V22 6.12643 30.52881 0.201 0.841
V23 16.01833 91.45287 0.175 0.861
V24 -1.73290 8.72275 -0.199 0.843
V25 8.61549 41.64026 0.207 0.836
V26 0.87876 10.51042 0.084 0.933
V27 6.89893 32.92813 0.210 0.834
V28 20.81647 110.84703 0.188 0.851
Amount 49.53423 264.32338 0.187 0.851
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 953.77 on 687 degrees of freedom
Residual deviance: 131.48 on 657 degrees of freedom
AIC: 193.48
Number of Fisher Scoring iterations: 25
# Score the validation set with the full logistic model and summarise the
# confusion matrix at a 0.1 probability cutoff.
pred <- predict(logistic.model, newdata = newData.validate, type = "response")
# Rows are the actual class, columns are "predicted fraud?". Fixing the factor
# levels keeps the table 2x2 even when every prediction lands on one side of
# the cutoff.
perf.table <- table(
  newData.validate$Class,
  factor(pred > .1, levels = c(FALSE, TRUE))
)
# Rearranged so row 1 / column 1 are the positive (fraud) class.
conf.mat <- data.frame(
  pred_T = c(perf.table[2, 2], perf.table[1, 2]),
  pred_F = c(perf.table[2, 1], perf.table[1, 1]),
  row.names = c("act_T", "act_F")
)
ACC <- (conf.mat[1, 1] + conf.mat[2, 2]) / sum(conf.mat)
# Sensitivity = TP / (TP + FN): divide by the actual-positive ROW. Dividing by
# the predicted-positive column would give precision instead.
Sensitivity <- conf.mat[1, 1] / sum(conf.mat[1, ])
# Specificity = TN / (TN + FP): the actual-negative row.
Specificity <- conf.mat[2, 2] / sum(conf.mat[2, ])
df.perf <- data.frame(ACC, Sensitivity, Specificity)
df.perf
# NOTE(review): MASS is attached after dplyr, so MASS::select masks
# dplyr::select from here on; use dplyr::select() explicitly if it is needed.
library(MASS)
# Stepwise selection by AIC starting from the full model, allowing terms to be
# both dropped and re-added. trace = TRUE prints each step (TRUE rather than T,
# since T is an ordinary variable that can be reassigned).
step <- stepAIC(logistic.model, direction = "both", trace = TRUE)
Start: AIC=193.48
Class ~ Time + V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 +
V11 + V12 + V13 + V14 + V15 + V16 + V17 + V18 + V19 + V20 +
V21 + V22 + V23 + V24 + V25 + V26 + V27 + V28 + Amount
glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- V1 1 131.5 191.5
- V5 1 131.5 191.5
- V26 1 131.5 191.5
- V13 1 131.5 191.5
- Time 1 131.5 191.5
- V20 1 131.9 191.9
- V27 1 132.2 192.2
- V23 1 132.5 192.5
- V24 1 133.0 193.0
- Amount 1 133.4 193.4
<none> 131.5 193.5
- V28 1 134.7 194.7
- V8 1 136.1 196.1
- V7 1 137.9 197.9
- V15 1 138.1 198.1
- V9 1 138.9 198.9
- V6 1 139.5 199.5
- V19 1 140.8 200.8
- V18 1 141.1 201.1
- V17 1 141.6 201.6
- V16 1 144.6 204.6
- V10 1 144.6 204.6
- V12 1 145.9 205.9
- V11 1 148.1 208.1
- V14 1 172.9 232.9
- V2 1 2451.0 2511.0
- V3 1 2523.1 2583.1
- V4 1 3243.9 3303.9
- V21 1 3316.0 3376.0
- V25 1 4181.1 4241.1
- V22 1 20784.9 20844.9
glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=191.48
Class ~ Time + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 +
V11 + V12 + V13 + V14 + V15 + V16 + V17 + V18 + V19 + V20 +
V21 + V22 + V23 + V24 + V25 + V26 + V27 + V28 + Amount
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- Time 1 131.5 189.5
- V27 1 132.3 190.3
- V13 1 132.4 190.4
<none> 131.5 191.5
- V24 1 133.7 191.7
- V28 1 134.9 192.9
+ V1 1 131.5 193.5
- V23 1 136.0 194.0
- V22 1 136.1 194.1
- V8 1 136.5 194.5
- V2 1 136.6 194.6
- V21 1 138.2 196.2
- V5 1 140.2 198.2
- V3 1 141.7 199.7
- V19 1 142.6 200.6
- V6 1 142.7 200.7
- V9 1 142.8 200.8
- V20 1 143.8 201.8
- V7 1 144.2 202.2
- V17 1 144.9 202.9
- V18 1 145.3 203.3
- V16 1 147.2 205.2
- V12 1 147.5 205.5
- V10 1 149.3 207.3
- V11 1 150.8 208.8
- V4 1 174.0 232.0
- V14 1 175.3 233.3
- V26 1 2018.4 2076.4
- V15 1 2523.1 2581.1
- V25 1 2667.2 2725.2
- Amount 1 4541.5 4599.5
glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=189.5
Class ~ V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + V11 + V12 +
V13 + V14 + V15 + V16 + V17 + V18 + V19 + V20 + V21 + V22 +
V23 + V24 + V25 + V26 + V27 + V28 + Amount
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- V27 1 132.4 188.4
- V13 1 132.6 188.6
<none> 131.5 189.5
- V24 1 133.8 189.8
- V28 1 135.0 191.0
+ Time 1 131.5 191.5
+ V1 1 131.5 191.5
- V23 1 136.1 192.1
- V22 1 136.3 192.3
- V8 1 136.6 192.6
- V2 1 136.7 192.7
- V21 1 138.7 194.7
- V5 1 140.3 196.3
- V19 1 142.7 198.7
- V3 1 142.7 198.7
- V6 1 142.8 198.8
- V9 1 143.0 199.0
- V20 1 143.9 199.9
- V7 1 144.3 200.3
- V17 1 145.1 201.1
- V18 1 145.3 201.3
- V16 1 147.3 203.3
- V12 1 147.5 203.5
- V10 1 149.3 205.3
- V11 1 151.0 207.0
- V14 1 175.3 231.3
- V4 1 175.6 231.6
- V25 1 1946.4 2002.4
- V15 1 2667.2 2723.2
- V26 1 3243.9 3299.9
- Amount 1 6559.9 6615.9
glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=188.42
Class ~ V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + V11 + V12 +
V13 + V14 + V15 + V16 + V17 + V18 + V19 + V20 + V21 + V22 +
V23 + V24 + V25 + V26 + V28 + Amount
glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- V13 1 132.6 186.6
- V26 1 132.7 186.7
- V24 1 133.8 187.8
<none> 132.4 188.4
+ V27 1 131.5 189.5
- V28 1 136.0 190.0
+ Time 1 132.3 190.3
+ V1 1 132.3 190.3
- V22 1 137.2 191.2
- V21 1 139.0 193.0
- V5 1 140.9 194.9
- V19 1 143.6 197.6
- V9 1 143.6 197.6
- V20 1 143.9 197.9
- V3 1 144.1 198.1
- V7 1 146.3 200.3
- V17 1 146.7 200.7
- V18 1 147.0 201.0
- V6 1 147.6 201.6
- V16 1 149.3 203.3
- V12 1 149.4 203.4
- V10 1 149.4 203.4
- V11 1 153.3 207.3
- V4 1 176.3 230.3
- V14 1 181.7 235.7
- V25 1 2667.2 2721.2
- V8 1 2739.3 2793.3
- V2 1 3243.9 3297.9
- Amount 1 3388.1 3442.1
- V23 1 3604.4 3658.4
- V15 1 3892.7 3946.7
glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=186.57
Class ~ V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + V11 + V12 +
V14 + V15 + V16 + V17 + V18 + V19 + V20 + V21 + V22 + V23 +
V24 + V25 + V26 + V28 + Amount
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- V26 1 132.9 184.9
- V24 1 133.8 185.8
<none> 132.6 186.6
- V28 1 136.1 188.1
+ Time 1 132.4 188.4
+ V13 1 132.4 188.4
+ V27 1 132.6 188.6
- V22 1 137.5 189.5
- V23 1 137.9 189.9
- V8 1 138.5 190.5
- V25 1 138.7 190.7
- V2 1 138.7 190.7
- Amount 1 139.5 191.5
- V21 1 139.9 191.9
- V15 1 140.7 192.7
- V5 1 141.2 193.2
- V9 1 143.6 195.6
- V19 1 143.7 195.7
- V20 1 144.1 196.1
- V3 1 144.2 196.2
- V7 1 146.4 198.4
- V17 1 146.8 198.8
- V18 1 147.0 199.0
- V6 1 148.1 200.1
- V10 1 149.4 201.4
- V12 1 149.5 201.5
- V16 1 149.5 201.5
- V11 1 153.5 205.5
- V4 1 176.5 228.5
- V14 1 183.6 235.6
+ V1 1 4901.9 4957.9
glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=184.93
Class ~ V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + V11 + V12 +
V14 + V15 + V16 + V17 + V18 + V19 + V20 + V21 + V22 + V23 +
V24 + V25 + V28 + Amount
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- V24 1 134.69 184.69
<none> 132.93 184.93
+ V26 1 132.57 186.57
+ V13 1 132.68 186.68
+ Time 1 132.70 186.70
+ V27 1 132.85 186.85
- V28 1 138.13 188.13
- V22 1 140.38 190.38
- V23 1 141.10 191.10
- V25 1 141.64 191.64
- V8 1 141.65 191.65
- V21 1 141.93 191.93
- V2 1 142.46 192.46
- V15 1 142.74 192.74
- Amount 1 143.42 193.42
- V5 1 143.58 193.58
- V9 1 146.58 196.58
- V19 1 146.81 196.81
- V20 1 146.93 196.93
- V3 1 148.16 198.16
- V7 1 148.92 198.92
- V18 1 149.49 199.49
- V17 1 150.65 200.65
- V12 1 152.20 202.20
- V6 1 152.23 202.23
- V10 1 153.39 203.39
- V16 1 154.73 204.73
- V11 1 156.31 206.31
- V4 1 179.82 229.82
- V14 1 184.52 234.52
+ V1 1 2523.06 2577.06
glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=184.69
Class ~ V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + V11 + V12 +
V14 + V15 + V16 + V17 + V18 + V19 + V20 + V21 + V22 + V23 +
V25 + V28 + Amount
glm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurredglm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
<none> 134.69 184.69
+ V24 1 132.93 184.93
+ V26 1 133.85 185.85
- V28 1 138.24 186.24
+ Time 1 134.52 186.52
+ V27 1 134.68 186.68
- V22 1 140.38 188.38
- V23 1 141.14 189.14
- V25 1 141.64 189.64
- V8 1 141.66 189.66
- V21 1 141.95 189.95
- V2 1 142.56 190.56
- V15 1 142.91 190.91
- Amount 1 143.50 191.50
- V5 1 143.58 191.58
- V9 1 146.82 194.82
- V20 1 146.96 194.96
- V19 1 147.08 195.08
- V3 1 148.24 196.24
- V7 1 148.94 196.94
- V18 1 149.66 197.66
- V17 1 150.71 198.71
- V12 1 152.30 200.30
- V6 1 152.43 200.43
- V10 1 153.48 201.48
- V16 1 154.95 202.95
- V11 1 156.39 204.39
- V4 1 179.87 227.87
- V14 1 184.79 232.79
+ V1 1 2378.88 2430.88
+ V13 1 2955.58 3007.58
# Coefficient table and fit statistics for the model returned by stepAIC.
summary(step)
Call:
glm(formula = Class ~ V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 +
V10 + V11 + V12 + V14 + V15 + V16 + V17 + V18 + V19 + V20 +
V21 + V22 + V23 + V25 + V28 + Amount, family = binomial,
data = newData.train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.8812 -0.1585 0.0000 0.0000 3.0080
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.1915 0.7049 -5.946 2.74e-09 ***
V2 11.8918 4.4644 2.664 0.007729 **
V3 -8.3449 2.9446 -2.834 0.004598 **
V4 7.5880 2.3582 3.218 0.001292 **
V5 -2.6438 1.1723 -2.255 0.024121 *
V6 -6.6407 2.1444 -3.097 0.001956 **
V7 -22.4832 7.9731 -2.820 0.004804 **
V8 5.7429 2.0498 2.802 0.005084 **
V9 -8.9079 3.2077 -2.777 0.005487 **
V10 -20.5296 7.1154 -2.885 0.003911 **
V11 15.1334 5.0775 2.980 0.002878 **
V12 -26.9351 9.1725 -2.936 0.003319 **
V14 -29.6791 9.8085 -3.026 0.002479 **
V15 -1.0802 0.3967 -2.723 0.006466 **
V16 -25.7191 8.7317 -2.945 0.003224 **
V17 -44.5236 15.4379 -2.884 0.003926 **
V18 -17.0165 5.8785 -2.895 0.003795 **
V19 7.1600 2.4632 2.907 0.003652 **
V20 -5.8474 1.7757 -3.293 0.000991 ***
V21 4.1979 1.6353 2.567 0.010256 *
V22 2.2779 1.0011 2.275 0.022877 *
V23 5.9484 2.4089 2.469 0.013536 *
V25 3.1093 1.2414 2.505 0.012258 *
V28 5.6067 3.3646 1.666 0.095632 .
Amount 20.1507 7.1488 2.819 0.004821 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 953.77 on 687 degrees of freedom
Residual deviance: 134.69 on 663 degrees of freedom
AIC: 184.69
Number of Fisher Scoring iterations: 19
# Refit the stepAIC-selected model on the TRAINING set so the validation rows
# stay unseen until they are scored. Fitting on newData.validate and then
# predicting the same rows leaks the labels into the evaluation.
# The formula is taken from `step` rather than retyped, so it always matches
# the terms stepAIC actually kept.
logistic_AIC <- glm(formula(step), family = binomial, data = newData.train)
glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and fit statistics for the logistic_AIC model.
summary(logistic_AIC)
Call:
glm(formula = Class ~ V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 +
V10 + V11 + V12 + V13 + V14 + V15 + V16 + V17 + V18 + V19 +
V20 + V21 + V22 + V23 + V24 + V25 + V26 + V27 + V28 + Amount,
family = binomial, data = newData.validate)
Deviance Residuals:
Min 1Q Median 3Q Max
-8.49 0.00 0.00 0.00 8.49
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -9.536e+14 5.249e+06 -181675792 <2e-16 ***
V2 4.680e+14 5.555e+06 84258151 <2e-16 ***
V3 5.445e+13 3.643e+06 14948307 <2e-16 ***
V4 4.065e+14 3.126e+06 130035833 <2e-16 ***
V5 4.457e+14 5.334e+06 83560078 <2e-16 ***
V6 -1.713e+14 4.615e+06 -37113171 <2e-16 ***
V7 6.566e+12 5.451e+06 1204454 <2e-16 ***
V8 -1.238e+11 2.507e+06 -49374 <2e-16 ***
V9 5.287e+14 4.457e+06 118629461 <2e-16 ***
V10 -3.503e+14 5.734e+06 -61086597 <2e-16 ***
V11 1.669e+14 4.181e+06 39913379 <2e-16 ***
V12 2.246e+14 3.594e+06 62498325 <2e-16 ***
V13 -2.573e+14 4.433e+06 -58053277 <2e-16 ***
V14 -6.228e+13 3.490e+06 -17844088 <2e-16 ***
V15 2.842e+14 4.614e+06 61594632 <2e-16 ***
V16 3.291e+14 5.238e+06 62826892 <2e-16 ***
V17 -2.363e+14 3.667e+06 -64431360 <2e-16 ***
V18 -5.136e+14 5.747e+06 -89367420 <2e-16 ***
V19 3.512e+14 4.410e+06 79633393 <2e-16 ***
V20 -9.236e+13 7.645e+06 -12080093 <2e-16 ***
V21 3.785e+13 3.270e+06 11573155 <2e-16 ***
V22 2.059e+14 6.407e+06 32137775 <2e-16 ***
V23 -6.824e+13 9.310e+06 -7329837 <2e-16 ***
V24 -1.254e+14 7.809e+06 -16052741 <2e-16 ***
V25 1.477e+14 8.252e+06 17903789 <2e-16 ***
V26 3.014e+14 8.790e+06 34286668 <2e-16 ***
V27 -2.955e+14 7.900e+06 -37411823 <2e-16 ***
V28 -5.741e+13 1.727e+07 -3324490 <2e-16 ***
Amount 4.102e+14 1.328e+07 30882173 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 410.34 on 295 degrees of freedom
Residual deviance: 2378.88 on 267 degrees of freedom
AIC: 2436.9
Number of Fisher Scoring iterations: 14
# Score the validation set with logistic_AIC and summarise the confusion
# matrix at a 0.1 probability cutoff.
pred <- predict(logistic_AIC, newdata = newData.validate, type = "response")
# Rows are the actual class, columns are "predicted fraud?". Fixing the factor
# levels keeps the table 2x2 even when every prediction lands on one side of
# the cutoff.
perf.table <- table(
  newData.validate$Class,
  factor(pred > .1, levels = c(FALSE, TRUE))
)
# Rearranged so row 1 / column 1 are the positive (fraud) class.
conf.mat <- data.frame(
  pred_T = c(perf.table[2, 2], perf.table[1, 2]),
  pred_F = c(perf.table[2, 1], perf.table[1, 1]),
  row.names = c("act_T", "act_F")
)
ACC <- (conf.mat[1, 1] + conf.mat[2, 2]) / sum(conf.mat)
# Sensitivity = TP / (TP + FN): divide by the actual-positive ROW. Dividing by
# the predicted-positive column would give precision instead.
Sensitivity <- conf.mat[1, 1] / sum(conf.mat[1, ])
# Specificity = TN / (TN + FP): the actual-negative row.
Specificity <- conf.mat[2, 2] / sum(conf.mat[2, ])
df.perf <- data.frame(ACC, Sensitivity, Specificity)
df.perf
# Classification metrics for one probability cutoff.
#
# An observation is flagged as positive when its score is strictly greater than
# `cutoff`. The second level of `actual` is treated as the positive class.
#
# Args:
#   cutoff: probability threshold.
#   scores: predicted probabilities; defaults to the script-level `pred`.
#   actual: true labels (two-level factor); defaults to newData.validate$Class.
#
# Returns:
#   Named numeric vector with elements ACC, Sensitivity and Specificity.
find.opt.cutoff <- function(cutoff,
                            scores = pred,
                            actual = newData.validate$Class) {
  # Fixed levels keep the table 2x2 when every score falls on one side of the
  # cutoff (e.g. cutoff = 0 or 1), so no manual column padding is needed.
  flagged <- factor(scores > cutoff, levels = c(FALSE, TRUE))
  perf.table <- table(actual, flagged)

  true.pos <- perf.table[2, 2]
  false.neg <- perf.table[2, 1]
  false.pos <- perf.table[1, 2]
  true.neg <- perf.table[1, 1]

  c(
    ACC = (true.pos + true.neg) / sum(perf.table),
    # Sensitivity is taken over the ACTUAL positives: TP / (TP + FN). Dividing
    # by the predicted positives would be precision and is 0/0 at cutoff = 1.
    Sensitivity = true.pos / (true.pos + false.neg),
    # Specificity is taken over the ACTUAL negatives: TN / (TN + FP).
    Specificity = true.neg / (true.neg + false.pos)
  )
}
# Evaluate the metrics on a grid of cutoffs from 0 to 1 and plot them.
cutoff <- seq(from = 0, to = 1, by = .1)
# sapply() yields one column per cutoff; transpose so each cutoff is a row.
results <- data.frame(t(sapply(cutoff, find.opt.cutoff)))
results$cutoff <- cutoff

# One line per metric: red = ACC, blue = Sensitivity, green = Specificity.
pl <- ggplot(data = results, mapping = aes(x = cutoff)) +
  geom_line(aes(y = ACC), size = 1, color = "red") +
  geom_line(aes(y = Sensitivity), size = 1, color = "blue") +
  geom_line(aes(y = Specificity), size = 1, color = "green") +
  ylab('performance') +
  theme_bw()
# Render the ggplot as an interactive plotly widget.
ggplotly(pl)